import json

import folium
import geopandas as gpd
import numpy as np
import pandas as pd
from folium.features import GeoJson, GeoJsonTooltip
from folium.plugins import DualMap

# Machine-learning model and evaluation utilities.
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
Home Prices in Texas
What is the average home price by Zipcode in Texas?
__
Show the code
# Load the raw US for-sale homes listing data.
df = pd.read_csv("./ml_datasets/US Homes Data (For Sale Properties).csv")

# Keep only Texas listings that are actively for sale and whose living
# space is plausible (0-3 sq ft values are treated as data-entry errors).
df = df[
    (df["state"].isin(["TX"]))
    & (df["property_status"].isin(["FOR_SALE"]))
    & (~df["living_space"].isin([0, 1, 2, 3]))
]

# Keep only residential property types (apartment, condo, manufactured,
# multi_family, single_family, townhouse) — exclude bare lots.
df = df[~df['property_type'].isin(['LOT'])]

# Drop the rows carrying a known bad negative land_space value.
df = df[~df['land_space'].isin([-10890.0])]

# Normalize land_space to square feet (1 acre = 43,560 sq ft); leave
# values already in square feet untouched.
df['land_space'] = round(
    df.apply(
        lambda row: row['land_space'] * 43560
        if row['land_space_unit'] == 'acres'
        else row['land_space'],
        axis=1,
    ),
    2,
)

# NOTE(review): the original cell also built a price_per_unit column here
# (twice — the second apply overwrote the first except for rows with
# non-positive/NaN living_space) and then dropped it in the very next
# statement. That computation was dead code and has been removed; the
# resulting DataFrame is unchanged.

# Drop identifier/text/location columns not needed for modeling.
df = df.drop(
    ['property_url', 'property_id', 'address', 'street_name', 'apartment',
     'city', 'state', 'land_space_unit', 'broker_id', 'property_status',
     'year_build', 'total_num_units', 'listing_age', 'RunDate',
     'agency_name', 'agent_name', 'agent_phone', 'latitude', 'longitude'],
    axis=1,
).reset_index(drop=True)

# One-hot encode the remaining categorical column.
df = pd.get_dummies(df, columns=['property_type'], dtype='int')

# Rename postcode -> zip_code for clarity and later joins.
df.rename(columns={'postcode': 'zip_code'}, inplace=True)
Show the code
# Attach ZIP-level spatial features by joining listings to Texas ZCTA
# polygon shapes.

# Texas ZIP-code (ZCTA) boundaries served as GeoJSON.
url = "https://raw.githubusercontent.com/OpenDataDE/State-zip-code-GeoJSON/refs/heads/master/tx_texas_zip_codes_geo.min.json"
zip_shapes = gpd.read_file(url)

# Both join keys must be strings for the merge below to match.
df['zip_code'] = df['zip_code'].astype(str)
zip_shapes['ZCTA5CE10'] = zip_shapes['ZCTA5CE10'].astype(str)

# Left join: keep every listing, attach its ZIP polygon where one exists.
new_df = pd.merge(df, zip_shapes, left_on='zip_code', right_on='ZCTA5CE10', how='left')

# Census metadata columns that carry no modeling signal.
census_meta = ['STATEFP10', 'ZCTA5CE10', 'GEOID10', 'CLASSFP10', 'MTFCC10',
               'FUNCSTAT10', 'ALAND10', 'AWATER10', 'INTPTLAT10',
               'INTPTLON10', 'PARTFLG10']
ml_clean = new_df.drop(census_meta, axis=1)

# Re-wrap as a GeoDataFrame so GeoPandas knows which column is geometry.
ml_clean = gpd.GeoDataFrame(ml_clean, geometry='geometry')

# ZIP centroid coordinates as numeric spatial features.
# NOTE(review): the source CRS appears to be geographic (lat/lon), so these
# centroids are in degrees, not meters — confirm if metric accuracy matters.
ml_clean['zip_centroid_lon'] = ml_clean.geometry.centroid.x
ml_clean['zip_centroid_lat'] = ml_clean.geometry.centroid.y

# The polygon itself is not a model feature; drop it.
ml_clean = ml_clean.drop(columns=['geometry'])
Show the code
# Train an XGBoost regressor on log-price and aggregate predictions per ZIP.

# Features and target; log1p compresses the heavy right tail of home prices.
X = ml_clean.drop(columns=['price'])
y = np.log1p(ml_clean['price'])

X_train_full, X_test_full, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ZIP codes are identifiers, not features: set them aside (index-aligned
# with each split) before training, so predictions can be grouped later.
zip_train = X_train_full['zip_code'].reset_index(drop=True)
zip_test = X_test_full['zip_code'].reset_index(drop=True)
X_train = X_train_full.drop(columns=['zip_code'])
X_test = X_test_full.drop(columns=['zip_code'])

# Fit the model and predict on the held-out set.
xgb = XGBRegressor(n_estimators=350, max_depth=10, learning_rate=.01, random_state=42)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)

# All metrics below are on the log-price scale.
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred))
print("Mean Squared Error (MSE):", mean_squared_error(y_test, y_pred))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error (RMSE):", rmse)
print("R² Score:", r2_score(y_test, y_pred))

# Per-ZIP mean of predicted vs actual log prices.
results = pd.DataFrame({
    'zip_code': zip_test,
    'prediction': y_pred,
    'actual_price': y_test.reset_index(drop=True),
})
results = results.groupby('zip_code')[['prediction', 'actual_price']].mean()
results['actual_price'] = results['actual_price'].round(2)
results = results.rename(columns={
    'prediction': 'average_price_prediction_log',
    'actual_price': 'actual_average_price_log',
})
results = pd.DataFrame(results).reset_index()

# Back-transform to dollar scale in separate columns. (expm1 of a mean of
# logs is a geometric-style average, not the arithmetic mean of prices.)
results['average_price_prediction'] = np.expm1(results['average_price_prediction_log'])
results['actual_average_price'] = np.expm1(results['actual_average_price_log'])

# Attach ZIP polygons for mapping (left join keeps every result row),
# then discard the census metadata columns.
results = pd.merge(results, zip_shapes, left_on='zip_code', right_on='ZCTA5CE10', how='left')
results = results.drop(['STATEFP10', 'ZCTA5CE10', 'GEOID10', 'CLASSFP10',
                        'MTFCC10', 'FUNCSTAT10', 'ALAND10', 'AWATER10',
                        'INTPTLAT10', 'INTPTLON10', 'PARTFLG10'], axis=1)

# Re-wrap as a GeoDataFrame so the geometry column is recognized.
results = gpd.GeoDataFrame(results, geometry='geometry')

# Simplify the polygons to shrink the rendered HTML. tolerance=0.01 keeps
# most detail; preserve_topology=True prevents invalid (self-intersecting
# or gappy) shapes after simplification.
results['geometry'] = results['geometry'].simplify(tolerance=0.01, preserve_topology=True)
Mean Absolute Error (MAE): 0.22063838742237502
Mean Squared Error (MSE): 0.1207861593836633
Root Mean Squared Error (RMSE): 0.34754303241996276
R² Score: 0.8472178918654716
Show the code
# Build two choropleth maps of Texas ZIP codes — one colored by the
# predicted average (log) price, one by the actual average (log) price.
# The original cell built each map with duplicated copy-pasted code;
# that has been factored into a single helper. m1/m2 remain module-level
# names because the display cell below uses them.

def _make_price_map(value_col, legend, tooltip_col, tooltip_label):
    """Return a folium Map of Texas colored by `value_col` per ZIP.

    value_col:     column of `results` used for the choropleth coloring.
    legend:        legend caption for the color scale.
    tooltip_col:   column shown in the hover tooltip (dollar scale).
    tooltip_label: label displayed next to the tooltip value.
    """
    # Center the view over Texas.
    m = folium.Map(location=[31.9686, -99.9018], zoom_start=5.25, tiles="openstreetmap")

    folium.Choropleth(
        # GeoJSON shapes of the ZIP code areas.
        geo_data=results,
        # DataFrame holding the values to visualize.
        data=results,
        # [join key tied to the shapes, value column to color by].
        columns=['zip_code', value_col],
        # Path inside each GeoJSON feature to the property matched
        # against the first entry of `columns`.
        key_on='feature.properties.zip_code',
        fill_color='plasma',
        fill_opacity=0.7,
        line_opacity=0.45,
        legend_name=legend,
    ).add_to(m)

    # Invisible overlay that only supplies hover tooltips — the
    # choropleth above already provides the coloring.
    GeoJson(
        results,
        style_function=lambda x: {
            'fillColor': 'transparent',
            'color': 'transparent',
            'weight': 0,
        },
        tooltip=GeoJsonTooltip(
            fields=['zip_code', tooltip_col],
            aliases=['ZIP Code:', tooltip_label],
            localize=True,  # locale-aware number formatting (commas etc.)
            sticky=True,    # tooltip follows the cursor within a shape
            labels=True,    # show the aliases as field labels
        ),
    ).add_to(m)
    return m

m1 = _make_price_map(
    'average_price_prediction_log',
    "Average Price Prediction (Log Scale- Smallest to Largest)",
    'average_price_prediction',
    'Average Price Prediction:',
)
m2 = _make_price_map(
    'actual_average_price_log',
    "Actual Average Price (Log Scale - Smallest to Largest)",
    'actual_average_price',
    'Actual Average Price:',
)

# Persist each map as its own standalone HTML file.
m1.save("ml_datasets/predicted_price_map.html")
m2.save("ml_datasets/actual_price_map.html")
Show the code
# Render the two maps side-by-side in the notebook output.
from IPython.display import HTML

# Each map's self-contained HTML is embedded into one half of a flexbox row.
side_by_side = f""" <div style="display: flex;"> <div style="width: 50%; height: 100%;"> <h3>Average Price Prediction (Log Scale- Smallest to Largest)</h3>{m1._repr_html_()} </div> <div style="width: 50%; height: 100%;"> <h3>Actual Average Price (Log Scale - Smallest to Largest)</h3>{m2._repr_html_()} </div> </div>"""
HTML(side_by_side)
Average Price Prediction (Log Scale- Smallest to Largest)
Make this Notebook Trusted to load map: File -> Trust Notebook
Actual Average Price (Log Scale - Smallest to Largest)
Make this Notebook Trusted to load map: File -> Trust Notebook